import re
import time
from urllib.parse import urljoin

import requests
from lxml import etree
# Crawl the notice listing pages under BASE_URL, append every notice's title,
# link and body text to OUTPUT_PATH, and report progress on stdout.

BASE_URL = 'https://jiaowu.web.hebust.edu.cn/tzgg/'
OUTPUT_PATH = './information.txt'
PAGE_COUNT = 10        # listing pages visited: index.htm, index1.htm ... index9.htm
REQUEST_TIMEOUT = 10   # seconds; without a timeout requests.get can block forever
CRAWL_DELAY = 3        # seconds to pause after each article request
BODY_XPATH = '/html/body/article/div/aside/div[2]/div[2]//text()'
# Captures (href, title attribute) from anchors on a listing page.  The title
# group is everything between `title=` and the next `>`, so it keeps any
# surrounding quote characters present in the markup.
LINK_PATTERN = re.compile(r'"(.*?)" target="_blank" title=(.*?)>')

headers={
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36 Edg/91.0.864.48"
}


def index_url(page: int) -> str:
    """Return the URL of listing page number *page*.

    The first listing page has no numeric suffix (``index.htm``); every later
    page is ``index<page>.htm``.
    """
    if page == 0:
        return f"{BASE_URL}index.htm"
    return f"{BASE_URL}index{page}.htm"


def extract_links(page_text: str) -> list:
    """Return ``(url, title)`` tuples for every link matched in *page_text*.

    Each href is resolved against BASE_URL with ``urljoin`` so that plain
    relative, ``../``-style and already-absolute hrefs all yield a valid URL.
    """
    return [
        (urljoin(BASE_URL, href), title)
        for href, title in LINK_PATTERN.findall(page_text)
    ]


def format_record(title: str, url: str, body_lines) -> str:
    """Return the text appended to the output file for one article.

    The record is a title line, a link line, then one line per entry of
    *body_lines*; every line is newline-terminated.
    """
    lines = ["标题=" + title, "链接=" + url]
    lines.extend(body_lines)
    return ''.join(line + '\n' for line in lines)


def fetch_text(url: str) -> str:
    """Download *url* and return its body decoded as UTF-8.

    Raises:
        requests.RequestException: on a network error, a timeout, or an HTTP
            error status (via ``raise_for_status``).
    """
    response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Force UTF-8 rather than relying on the encoding requests would pick.
    response.encoding = 'utf-8'
    return response.text


def fetch_body_lines(url: str) -> list:
    """Download an article page and return the selected body text nodes.

    Only every second text node matched by BODY_XPATH is kept, starting from
    the second one (slice ``[1::2]``).

    Raises:
        requests.RequestException: propagated from ``fetch_text``.
    """
    tree = etree.HTML(fetch_text(url))
    # NOTE(review): etree.HTML can yield None for an empty document on some
    # lxml versions -- treat that as "no body text" instead of crashing.
    if tree is None:
        return []
    return tree.xpath(BODY_XPATH)[1::2]


def main() -> None:
    """Crawl all listing pages and append every article to OUTPUT_PATH."""
    fetched = 0
    for page in range(PAGE_COUNT):
        listing_url = index_url(page)
        try:
            page_text = fetch_text(listing_url)
        except requests.RequestException as exc:
            # One unreachable listing page should not abort the whole crawl.
            print(f'请求失败,已跳过: {listing_url} ({exc})')
            continue

        for url, title in extract_links(page_text):
            try:
                body_lines = fetch_body_lines(url)
            except requests.RequestException as exc:
                print(f'请求失败,已跳过: {url} ({exc})')
                body_lines = None

            # Title and link are recorded even when the body could not be
            # fetched; the file is opened once per article.
            with open(OUTPUT_PATH, 'a', encoding='utf-8') as fp:
                fp.write(format_record(title, url, body_lines or []))

            if body_lines is not None:
                fetched += 1
                print(f'第{fetched}篇文章已获取完毕,文章标题为{title}')
            # Pause between article requests to avoid hammering the server.
            time.sleep(CRAWL_DELAY)


if __name__ == "__main__":
    main()